In [1]:
lc_data = pd.DataFrame.from_csv('./lc_dataframe(cleaning).csv')
lc_data = lc_data.reset_index()
lc_data.tail()
Out[1]:
loan_amnt
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
annual_inc
...
dti
delinq_2yrs
inq_last_6mths
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
acc_now_delinq
268131
31050
60
21.99
857.40
6
61
1
10
1
875000.0
...
9.66
1
0
10
0
25770
79.3
13
0
0
268132
10800
36
7.89
337.89
1
15
1
8
1
92400.0
...
19.62
1
0
11
0
9760
68.7
36
1
0
268133
9000
36
9.17
286.92
2
22
1
1
1
80000.0
...
3.97
1
0
8
0
6320
51.8
17
0
0
268134
14400
60
25.99
431.06
6
65
0
11
5
62000.0
...
16.88
0
1
9
1
5677
45.1
30
0
0
268135
8000
36
12.59
267.98
3
32
1
4
4
45000.0
...
26.21
0
0
12
0
9097
50.8
47
1
0
5 rows × 25 columns
In [2]:
y = lc_data['loan_status']
In [3]:
lc_data = lc_data.drop('loan_status', 1)
In [4]:
lc_data = pd.concat([lc_data, y], axis = 1)
In [5]:
lc_data.tail()
Out[5]:
loan_amnt
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
annual_inc
...
delinq_2yrs
inq_last_6mths
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
acc_now_delinq
loan_status
268131
31050
60
21.99
857.40
6
61
1
10
1
875000.0
...
1
0
10
0
25770
79.3
13
0
0
1
268132
10800
36
7.89
337.89
1
15
1
8
1
92400.0
...
1
0
11
0
9760
68.7
36
1
0
1
268133
9000
36
9.17
286.92
2
22
1
1
1
80000.0
...
1
0
8
0
6320
51.8
17
0
0
1
268134
14400
60
25.99
431.06
6
65
0
11
5
62000.0
...
0
1
9
1
5677
45.1
30
0
0
1
268135
8000
36
12.59
267.98
3
32
1
4
4
45000.0
...
0
0
12
0
9097
50.8
47
1
0
1
5 rows × 25 columns
In [6]:
lc_data = sm.add_constant(lc_data)
model_data = sm.OLS(lc_data.ix[:, -1], lc_data.ix[:, :-1])
result_data = model_data.fit()
result_data
print(result_data.summary())
OLS Regression Results
==============================================================================
Dep. Variable: loan_status R-squared: 0.092
Model: OLS Adj. R-squared: 0.092
Method: Least Squares F-statistic: 1129.
Date: Thu, 02 Mar 2017 Prob (F-statistic): 0.00
Time: 06:44:50 Log-Likelihood: -1.3353e+05
No. Observations: 268136 AIC: 2.671e+05
Df Residuals: 268111 BIC: 2.674e+05
Df Model: 24
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const 1.1595 0.008 141.853 0.000 1.143 1.176
loan_amnt 2.347e-06 6.94e-07 3.380 0.001 9.86e-07 3.71e-06
term -0.0041 0.000 -21.364 0.000 -0.004 -0.004
int_rate -0.0018 0.001 -2.299 0.022 -0.003 -0.000
installment -0.0001 2.16e-05 -5.863 0.000 -0.000 -8.43e-05
grade 0.0412 0.006 6.852 0.000 0.029 0.053
sub_grade -0.0088 0.001 -11.863 0.000 -0.010 -0.007
emp_title 0.0878 0.004 25.043 0.000 0.081 0.095
emp_length 0.0005 0.000 2.041 0.041 1.79e-05 0.001
home_ownership -0.0086 0.000 -20.104 0.000 -0.009 -0.008
annual_inc 2.267e-07 1.52e-08 14.904 0.000 1.97e-07 2.56e-07
verification_status -0.0061 0.002 -3.347 0.001 -0.010 -0.003
issue_d 0.0020 0.000 8.664 0.000 0.002 0.002
desc 6.036e-05 3.81e-06 15.828 0.000 5.29e-05 6.78e-05
purpose -0.0033 0.000 -10.346 0.000 -0.004 -0.003
dti -0.0051 0.000 -45.055 0.000 -0.005 -0.005
delinq_2yrs -0.0146 0.001 -13.987 0.000 -0.017 -0.013
inq_last_6mths -0.0077 0.001 -9.989 0.000 -0.009 -0.006
open_acc -0.0035 0.000 -15.719 0.000 -0.004 -0.003
pub_rec -0.0052 0.002 -2.946 0.003 -0.009 -0.002
revol_bal 4.371e-07 4.72e-08 9.252 0.000 3.45e-07 5.3e-07
revol_util -0.0005 3.7e-05 -12.387 0.000 -0.001 -0.000
total_acc 0.0027 9.34e-05 28.426 0.000 0.002 0.003
initial_list_status -0.0125 0.002 -7.097 0.000 -0.016 -0.009
acc_now_delinq -0.0295 0.012 -2.420 0.016 -0.053 -0.006
==============================================================================
Omnibus: 38870.087 Durbin-Watson: 1.993
Prob(Omnibus): 0.000 Jarque-Bera (JB): 58406.052
Skew: -1.140 Prob(JB): 0.00
Kurtosis: 2.816 Cond. No. 1.51e+06
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [7]:
Conti_Variable_data = lc_data.drop('grade', 1)
Conti_Variable_data = lc_data.drop('sub_grade', 1)
Conti_Variable_data = lc_data.drop('emp_length', 1)
Conti_Variable_data = lc_data.drop('home_ownership', 1)
Conti_Variable_data = lc_data.drop('verification_status', 1)
Conti_Variable_data = lc_data.drop('issue_d', 1)
Conti_Variable_data = lc_data.drop('purpose', 1)
Conti_Variable_data = lc_data.drop('initial_list_status', 1)
In [8]:
Conti_Variable_data.tail()
Out[8]:
const
loan_amnt
term
int_rate
installment
grade
sub_grade
emp_title
emp_length
home_ownership
...
dti
delinq_2yrs
inq_last_6mths
open_acc
pub_rec
revol_bal
revol_util
total_acc
acc_now_delinq
loan_status
268131
1.0
31050
60
21.99
857.40
6
61
1
10
1
...
9.66
1
0
10
0
25770
79.3
13
0
1
268132
1.0
10800
36
7.89
337.89
1
15
1
8
1
...
19.62
1
0
11
0
9760
68.7
36
0
1
268133
1.0
9000
36
9.17
286.92
2
22
1
1
1
...
3.97
1
0
8
0
6320
51.8
17
0
1
268134
1.0
14400
60
25.99
431.06
6
65
0
11
5
...
16.88
0
1
9
1
5677
45.1
30
0
1
268135
1.0
8000
36
12.59
267.98
3
32
1
4
4
...
26.21
0
0
12
0
9097
50.8
47
0
1
5 rows × 25 columns
In [9]:
Conti_Variable_data = sm.add_constant(Conti_Variable_data)
model_data2 = sm.OLS(Conti_Variable_data.ix[:, -1], Conti_Variable_data.ix[:, :-1])
result_data2 = model_data2.fit()
result_data2
print(result_data2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: loan_status R-squared: 0.092
Model: OLS Adj. R-squared: 0.092
Method: Least Squares F-statistic: 1176.
Date: Thu, 02 Mar 2017 Prob (F-statistic): 0.00
Time: 06:44:52 Log-Likelihood: -1.3356e+05
No. Observations: 268136 AIC: 2.672e+05
Df Residuals: 268112 BIC: 2.674e+05
Df Model: 23
Covariance Type: nonrobust
=======================================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------
const 1.1558 0.008 141.676 0.000 1.140 1.172
loan_amnt 2.22e-06 6.94e-07 3.197 0.001 8.59e-07 3.58e-06
term -0.0041 0.000 -21.426 0.000 -0.004 -0.004
int_rate -0.0013 0.001 -1.699 0.089 -0.003 0.000
installment -0.0001 2.16e-05 -5.713 0.000 -0.000 -8.11e-05
grade 0.0433 0.006 7.215 0.000 0.032 0.055
sub_grade -0.0091 0.001 -12.384 0.000 -0.011 -0.008
emp_title 0.0875 0.004 24.945 0.000 0.081 0.094
emp_length 0.0004 0.000 1.953 0.051 -1.54e-06 0.001
home_ownership -0.0086 0.000 -20.076 0.000 -0.009 -0.008
annual_inc 2.253e-07 1.52e-08 14.813 0.000 1.95e-07 2.55e-07
verification_status -0.0060 0.002 -3.338 0.001 -0.010 -0.002
issue_d 0.0020 0.000 8.745 0.000 0.002 0.002
desc 6.578e-05 3.74e-06 17.604 0.000 5.85e-05 7.31e-05
purpose -0.0031 0.000 -9.998 0.000 -0.004 -0.003
dti -0.0051 0.000 -45.365 0.000 -0.005 -0.005
delinq_2yrs -0.0147 0.001 -14.151 0.000 -0.017 -0.013
inq_last_6mths -0.0076 0.001 -9.861 0.000 -0.009 -0.006
open_acc -0.0035 0.000 -15.835 0.000 -0.004 -0.003
pub_rec -0.0059 0.002 -3.330 0.001 -0.009 -0.002
revol_bal 4.397e-07 4.72e-08 9.307 0.000 3.47e-07 5.32e-07
revol_util -0.0005 3.7e-05 -12.232 0.000 -0.001 -0.000
total_acc 0.0027 9.34e-05 28.373 0.000 0.002 0.003
acc_now_delinq -0.0300 0.012 -2.458 0.014 -0.054 -0.006
==============================================================================
Omnibus: 38890.927 Durbin-Watson: 1.992
Prob(Omnibus): 0.000 Jarque-Bera (JB): 58452.233
Skew: -1.140 Prob(JB): 0.00
Kurtosis: 2.817 Cond. No. 1.51e+06
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.51e+06. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]:
Content source: shinys825/lc_project
Similar notebooks: